package crawler;

import org.apache.log4j.BasicConfigurator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import pipeline.MyPipeLine;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
import us.codecraft.xsoup.Xsoup;
import utils.Header;

import java.io.IOException;
import java.util.*;

/**
 * WebMagic {@link PageProcessor} for mzitu.com.
 *
 * <p>Two kinds of pages are handled:
 * <ul>
 *   <li>the start page ({@link #START_URL}): every list page is walked through its
 *       "next" link and all album links found are queued as target requests;</li>
 *   <li>album pages (URLs matching {@link #POST_URL}): the album title, the image URLs
 *       and the page URL each image was found on are stored in the page's result
 *       fields {@code title}, {@code imgs} and {@code nowUrls}.</li>
 * </ul>
 *
 * <p>Follow-up pages are fetched directly with Jsoup inside {@link #process(Page)},
 * sending the previously visited page as the {@code Referer} header.
 *
 * @author xbl
 * @date 2018/9/16 13:23
 */
public class MeizituCrawler implements PageProcessor {

    private static final String START_URL = "http://www.mzitu.com/";

    private static final String POST_URL = "http://www.mzitu.com/\\d+";

    /** Album links on a list page. */
    private static final String ALBUM_LINKS_XPATH = "//ul[@id='pins']/li/a/@href";

    /** Link to the next list page. */
    private static final String NEXT_LIST_PAGE_XPATH = "//a[@class='next page-numbers']/@href";

    /** Image source(s) on an album page. */
    private static final String IMAGE_XPATH = "//div[@class='main-image']/p/a/img/@src";

    /** Targets of the links in an album page's navigation bar. */
    private static final String PAGE_NAV_HREF_XPATH = "//div[@class='pagenavi']/a/@href";

    /** Labels of the links in an album page's navigation bar. */
    private static final String PAGE_NAV_LABEL_XPATH = "//div[@class='pagenavi']/a/span/text()";

    /** Label text of the navigation link that leads to the next image page. */
    private static final String NEXT_PAGE_LABEL = "下一页»";

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

    /**
     * Dispatches the page to the list-page or album-page handler depending on its URL.
     * Pages matching neither {@link #START_URL} nor {@link #POST_URL} are ignored.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        String pageUrl = page.getUrl().toString();
        if (START_URL.equals(pageUrl)) {
            processListPage(page, pageUrl);
        } else if (page.getUrl().regex(POST_URL).match()) {
            processAlbumPage(page, pageUrl);
        }
    }

    /**
     * Collects the album links of the start page and of every following list page,
     * then queues them all as target requests.
     *
     * @param page     the start page
     * @param startUrl URL of the start page, used as the first {@code Referer}
     */
    private void processListPage(Page page, String startUrl) {
        // Copy so that appending never depends on the mutability of the selector's result.
        List<String> albumUrls = new ArrayList<>(page.getHtml().xpath(ALBUM_LINKS_XPATH).all());
        String nextUrl = page.getHtml().xpath(NEXT_LIST_PAGE_XPATH).toString();
        String refererUrl = startUrl;

        // Walk the list pages until there is no usable "next" link.
        while (nextUrl != null && !nextUrl.isEmpty()) {
            try {
                Document document = fetch(nextUrl, refererUrl);
                albumUrls.addAll(Xsoup.select(document, ALBUM_LINKS_XPATH).list());
                refererUrl = nextUrl;
                nextUrl = Xsoup.select(document, NEXT_LIST_PAGE_XPATH).get();
            } catch (IOException e) {
                // nextUrl is unchanged here; stop instead of re-requesting the same
                // failing URL forever. Albums gathered so far are still queued.
                e.printStackTrace();
                break;
            }
        }
        page.addTargetRequests(albumUrls);
    }

    /**
     * Extracts title, image URLs and source page URLs from an album page.
     *
     * <p>When the page shows several images at once, all of them are attributed to the
     * page itself. When it shows a single image, the "next page" links are followed to
     * collect the remaining images. A page without any image is marked as skipped.
     *
     * @param page    the album page
     * @param pageUrl URL of the album page
     */
    private void processAlbumPage(Page page, String pageUrl) {
        String title = page.getHtml().xpath("//h2/text()").get();
        List<String> firstPageImages = page.getHtml().xpath(IMAGE_XPATH).all();
        if (firstPageImages.isEmpty()) {
            // Nothing to download; avoid indexing into an empty list.
            page.setSkip(true);
            return;
        }

        // Insertion-ordered so images are emitted in the order they were discovered.
        // NOTE(review): imgs/nowUrls look like parallel collections; confirm how
        // MyPipeLine consumes them.
        Set<String> imgUrls = new LinkedHashSet<>();
        List<String> nowUrls = new ArrayList<>();
        for (String imgUrl : firstPageImages) {
            imgUrls.add(imgUrl);
            nowUrls.add(pageUrl);
        }

        if (firstPageImages.size() == 1) {
            // One image per page: follow the pagination to gather the rest.
            collectPagedImages(page, pageUrl, imgUrls, nowUrls);
        }

        page.putField("imgs", imgUrls);
        page.putField("title", title);
        page.putField("nowUrls", nowUrls);
    }

    /**
     * Follows the "next page" links of a one-image-per-page album and records each
     * image URL together with the URL of the page it was found on.
     *
     * @param page    the first album page
     * @param pageUrl URL of the first album page, used as the first {@code Referer}
     * @param imgUrls receives the image URLs found on the following pages
     * @param nowUrls receives the URL of each page that contributed an image
     */
    private void collectPagedImages(Page page, String pageUrl, Set<String> imgUrls, List<String> nowUrls) {
        String refererUrl = pageUrl;
        String nextUrl = findNextPageUrl(
                page.getHtml().xpath(PAGE_NAV_HREF_XPATH).all(),
                page.getHtml().xpath(PAGE_NAV_LABEL_XPATH).all());

        while (nextUrl != null) {
            try {
                Document document = fetch(nextUrl, refererUrl);
                String imgUrl = Xsoup.select(document, IMAGE_XPATH).get();
                refererUrl = nextUrl;
                if (imgUrl != null) {
                    // Record image and page together; a page without an image adds neither.
                    imgUrls.add(imgUrl);
                    nowUrls.add(refererUrl);
                }
                nextUrl = findNextPageUrl(
                        Xsoup.select(document, PAGE_NAV_HREF_XPATH).list(),
                        Xsoup.select(document, PAGE_NAV_LABEL_XPATH).list());
            } catch (IOException e) {
                // nextUrl is unchanged here; stop instead of retrying endlessly and
                // keep the images collected so far.
                e.printStackTrace();
                break;
            }
        }
    }

    /**
     * Returns the target of the navigation link labelled {@link #NEXT_PAGE_LABEL}.
     *
     * <p>Links and labels are paired by position; only positions present in both lists
     * are considered. If several links carry the label, the last one wins.
     *
     * @param hrefs  link targets of the navigation bar
     * @param labels link labels of the navigation bar
     * @return the next page URL, or {@code null} when there is no such link
     */
    private static String findNextPageUrl(List<String> hrefs, List<String> labels) {
        int pairCount = Math.min(hrefs.size(), labels.size());
        String nextUrl = null;
        for (int i = 0; i < pairCount; i++) {
            if (NEXT_PAGE_LABEL.equals(labels.get(i))) {
                nextUrl = hrefs.get(i);
            }
        }
        return nextUrl;
    }

    /**
     * Downloads a page with Jsoup using the shared request headers.
     *
     * @param url        page to download
     * @param refererUrl value sent as the {@code Referer} header
     * @return the parsed document
     * @throws IOException if the request fails
     */
    private static Document fetch(String url, String refererUrl) throws IOException {
        return Jsoup.connect(url).headers(Header.header).header("Referer", refererUrl).get();
    }

    /**
     * Returns the site configuration after adding every entry of {@code Header.header}
     * to it as a request header.
     *
     * @return the site configuration used by the spider
     */
    @Override
    public Site getSite() {
        Header.header.forEach(site::addHeader);
        return site;
    }

    /**
     * Starts the spider at {@link #START_URL} with {@link MyPipeLine} as pipeline and a
     * file-backed queue scheduler.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Quick default Log4j setup.
        BasicConfigurator.configure();
        MyPipeLine pipeLine = new MyPipeLine();
        MeizituCrawler meizituCrawler = new MeizituCrawler();
        Spider.create(meizituCrawler)
                .addPipeline(pipeLine)
                .setScheduler(new FileCacheQueueScheduler("D:\\迅雷下载\\妹子图"))
                .addUrl(START_URL)
                .start();
    }
}
